animelist.csv contains the list of all anime registered by each user, with the respective score, watching status, and number of episodes watched. This dataset contains 109 million rows, 17,562 different anime, and 325,772 different users.
anime.csv contains general information about every anime (17,562 different anime), such as genre, stats, studio, etc. This file has the following columns:
import pandas as pd
import numpy as np
# Load the per-title metadata table (one row per anime, keyed by MAL_ID).
anime = pd.read_csv("dataset/anime.csv")
# Print column names, non-null counts and dtypes to see which columns need cleaning.
anime.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 17562 entries, 0 to 17561 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 MAL_ID 17562 non-null int64 1 Name 17562 non-null object 2 Score 17562 non-null object 3 Genres 17562 non-null object 4 English name 17562 non-null object 5 Japanese name 17562 non-null object 6 Type 17562 non-null object 7 Episodes 17562 non-null object 8 Aired 17562 non-null object 9 Premiered 17562 non-null object 10 Producers 17562 non-null object 11 Licensors 17562 non-null object 12 Studios 17562 non-null object 13 Source 17562 non-null object 14 Duration 17562 non-null object 15 Rating 17562 non-null object 16 Ranked 17562 non-null object 17 Popularity 17562 non-null int64 18 Members 17562 non-null int64 19 Favorites 17562 non-null int64 20 Watching 17562 non-null int64 21 Completed 17562 non-null int64 22 On-Hold 17562 non-null int64 23 Dropped 17562 non-null int64 24 Plan to Watch 17562 non-null int64 25 Score-10 17562 non-null object 26 Score-9 17562 non-null object 27 Score-8 17562 non-null object 28 Score-7 17562 non-null object 29 Score-6 17562 non-null object 30 Score-5 17562 non-null object 31 Score-4 17562 non-null object 32 Score-3 17562 non-null object 33 Score-2 17562 non-null object 34 Score-1 17562 non-null object dtypes: int64(9), object(26) memory usage: 4.7+ MB
anime.head()
| MAL_ID | Name | Score | Genres | English name | Japanese name | Type | Episodes | Aired | Premiered | ... | Score-10 | Score-9 | Score-8 | Score-7 | Score-6 | Score-5 | Score-4 | Score-3 | Score-2 | Score-1 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Cowboy Bebop | 8.78 | Action, Adventure, Comedy, Drama, Sci-Fi, Space | Cowboy Bebop | カウボーイビバップ | TV | 26 | Apr 3, 1998 to Apr 24, 1999 | Spring 1998 | ... | 229170.0 | 182126.0 | 131625.0 | 62330.0 | 20688.0 | 8904.0 | 3184.0 | 1357.0 | 741.0 | 1580.0 |
| 1 | 5 | Cowboy Bebop: Tengoku no Tobira | 8.39 | Action, Drama, Mystery, Sci-Fi, Space | Cowboy Bebop:The Movie | カウボーイビバップ 天国の扉 | Movie | 1 | Sep 1, 2001 | Unknown | ... | 30043.0 | 49201.0 | 49505.0 | 22632.0 | 5805.0 | 1877.0 | 577.0 | 221.0 | 109.0 | 379.0 |
| 2 | 6 | Trigun | 8.24 | Action, Sci-Fi, Adventure, Comedy, Drama, Shounen | Trigun | トライガン | TV | 26 | Apr 1, 1998 to Sep 30, 1998 | Spring 1998 | ... | 50229.0 | 75651.0 | 86142.0 | 49432.0 | 15376.0 | 5838.0 | 1965.0 | 664.0 | 316.0 | 533.0 |
| 3 | 7 | Witch Hunter Robin | 7.27 | Action, Mystery, Police, Supernatural, Drama, ... | Witch Hunter Robin | Witch Hunter ROBIN (ウイッチハンターロビン) | TV | 26 | Jul 2, 2002 to Dec 24, 2002 | Summer 2002 | ... | 2182.0 | 4806.0 | 10128.0 | 11618.0 | 5709.0 | 2920.0 | 1083.0 | 353.0 | 164.0 | 131.0 |
| 4 | 8 | Bouken Ou Beet | 6.98 | Adventure, Fantasy, Shounen, Supernatural | Beet the Vandel Buster | 冒険王ビィト | TV | 52 | Sep 30, 2004 to Sep 29, 2005 | Fall 2004 | ... | 312.0 | 529.0 | 1242.0 | 1713.0 | 1068.0 | 634.0 | 265.0 | 83.0 | 50.0 | 27.0 |
5 rows × 35 columns
anime.shape
(17562, 35)
anime.describe()
| MAL_ID | Popularity | Members | Favorites | Watching | Completed | On-Hold | Dropped | Plan to Watch | |
|---|---|---|---|---|---|---|---|---|---|
| count | 17562.000000 | 17562.000000 | 1.756200e+04 | 17562.000000 | 17562.000000 | 1.756200e+04 | 17562.000000 | 17562.000000 | 17562.000000 |
| mean | 21477.192347 | 8763.452340 | 3.465854e+04 | 457.746270 | 2231.487758 | 2.209557e+04 | 955.049653 | 1176.599533 | 8199.831227 |
| std | 14900.093170 | 5059.327278 | 1.252821e+05 | 4063.473313 | 14046.688133 | 9.100919e+04 | 4275.675096 | 4740.348653 | 23777.691963 |
| min | 1.000000 | 0.000000 | 1.000000e+00 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 5953.500000 | 4383.500000 | 3.360000e+02 | 0.000000 | 13.000000 | 1.110000e+02 | 6.000000 | 37.000000 | 112.000000 |
| 50% | 22820.000000 | 8762.500000 | 2.065000e+03 | 3.000000 | 73.000000 | 8.175000e+02 | 45.000000 | 77.000000 | 752.500000 |
| 75% | 35624.750000 | 13145.000000 | 1.322325e+04 | 31.000000 | 522.000000 | 6.478000e+03 | 291.750000 | 271.000000 | 4135.500000 |
| max | 48492.000000 | 17565.000000 | 2.589552e+06 | 183914.000000 | 887333.000000 | 2.182587e+06 | 187919.000000 | 174710.000000 | 425531.000000 |
# Load the user list table: one row per (user_id, anime_id) entry.
# NOTE(review): this file has on the order of 1e8 rows, so the load is slow and
# memory-hungry — confirm the machine has enough RAM before running.
rating = pd.read_csv("dataset/animelist.csv")
rating.shape
(109224747, 5)
rating.head()
| user_id | anime_id | rating | watching_status | watched_episodes | |
|---|---|---|---|---|---|
| 0 | 0 | 67 | 9 | 1 | 1 |
| 1 | 0 | 6702 | 7 | 1 | 4 |
| 2 | 0 | 242 | 10 | 1 | 4 |
| 3 | 0 | 4898 | 0 | 1 | 1 |
| 4 | 0 | 21 | 10 | 1 | 0 |
# Drop list entries where no episode was watched (watched_episodes == 0).
# This filters individual (user, anime) rows, not whole users: a user stays in
# the table as long as at least one of their entries has watched_episodes != 0.
rating = rating[rating.watched_episodes!=0]
rating.shape
(77788510, 5)
# Fraction (not percent) of rows removed by the watched_episodes filter.
# 109224747 is the row count of the unfiltered table, hard-coded from the
# `rating.shape` output obtained right after loading the CSV.
(109224747-rating.shape[0])/109224747
0.28781240390513335
anime.duplicated().sum()
0
anime.isna().sum()
MAL_ID 0 Name 0 Score 0 Genres 0 English name 0 Japanese name 0 Type 0 Episodes 0 Aired 0 Premiered 0 Producers 0 Licensors 0 Studios 0 Source 0 Duration 0 Rating 0 Ranked 0 Popularity 0 Members 0 Favorites 0 Watching 0 Completed 0 On-Hold 0 Dropped 0 Plan to Watch 0 Score-10 0 Score-9 0 Score-8 0 Score-7 0 Score-6 0 Score-5 0 Score-4 0 Score-3 0 Score-2 0 Score-1 0 dtype: int64
rating.isna().sum()
user_id 0 anime_id 0 rating 0 watching_status 0 watched_episodes 0 dtype: int64
rating.duplicated().sum()
0
anime['Aired'].value_counts()
Unknown 309
2005 57
2004 49
2021 to ? 46
2003 46
...
May 21, 2014 1
Dec 24, 1978 1
Mar 20, 1985 1
May 1, 2015 to Jul 3, 2015 1
Mar 27, 2020 1
Name: Aired, Length: 11947, dtype: int64
import re
def extract_year(series):
    """Return the first four-digit year found in an ``Aired`` value.

    Despite the parameter name, this receives a single cell value (it is
    applied element-wise through ``Series.map``), for example
    ``"Apr 3, 1998 to Apr 24, 1999"``, ``"2005"`` or ``"Unknown"``.

    Args:
        series: One ``Aired`` cell value.

    Returns:
        The first run of four digits as a string (e.g. ``"1998"``), or the
        integer ``0`` when the value is ``"Unknown"``, is not a string, or
        contains no four-digit run. The caller casts the result to ``int``,
        so both return forms end up as integers.
    """
    # Missing cells (NaN/None) are not strings and cannot be searched.
    if not isinstance(series, str) or series == 'Unknown':
        return 0
    m = re.search(r'[0-9]{4}', series)
    # re.search returns None when nothing matches; fall back to the same
    # sentinel used for 'Unknown' instead of raising AttributeError.
    if m is None:
        return 0
    return m.group()
# Derive an integer Year column from the first year mentioned in Aired;
# rows whose Aired value is 'Unknown' get the sentinel year 0.
anime['Year'] = anime['Aired'].map(extract_year).astype(int)
anime['Premiered'].value_counts()
Unknown 12817
Spring 2017 86
Fall 2016 79
Spring 2018 77
Spring 2016 74
...
Spring 1961 1
Summer 1974 1
Summer 1986 1
Winter 1974 1
Winter 1985 1
Name: Premiered, Length: 231, dtype: int64
anime['Type'].value_counts()
TV 4996 OVA 3894 Movie 3041 Special 2218 ONA 1907 Music 1469 Unknown 37 Name: Type, dtype: int64
# Premiered looks like "<Season> <Year>" (or just "Unknown"); keep only the
# first space-separated token as the Season column.
premiered_parts = anime['Premiered'].str.split(pat=' ', expand=True)
anime['Season'] = premiered_parts[0]
anime['Season'].value_counts().reset_index()
| index | Season | |
|---|---|---|
| 0 | Unknown | 12817 |
| 1 | Spring | 1611 |
| 2 | Fall | 1389 |
| 3 | Winter | 942 |
| 4 | Summer | 803 |
anime['Score'].value_counts()
Unknown 5141
6.48 74
6.31 72
6.3 72
6.52 71
...
3.35 1
3.26 1
8.3 1
3.91 1
8.75 1
Name: Score, Length: 533, dtype: int64
# Replace 'Unknown' scores with 0 so those rows are kept and the column can be
# cast to float.
# NOTE(review): 0 is a sentinel here, not a real score — any mean or ranking
# computed over Score will include these rows unless they are filtered out.
anime['Score'] = anime['Score'].replace('Unknown', 0).astype(float)
def split_labels(series):
    """Split a comma-separated label string into a list of clean labels.

    Despite the parameter name, this receives a single cell value (it is
    applied element-wise through ``Series.map``), for example
    ``"Action, Adventure, Comedy"`` or ``"Unknown"``.

    Args:
        series: One comma-separated string of labels.

    Returns:
        A list of labels with surrounding whitespace removed. Empty labels
        and every ``"Unknown"`` placeholder are dropped, so a value of
        ``"Unknown"`` yields an empty list.
    """
    # The separator in the data is ", ", so a plain split(",") leaves a
    # leading space on every label after the first; strip each one so that
    # "Adventure" and " Adventure" are not treated as different labels.
    labels = [label.strip() for label in series.split(",")]
    return [label for label in labels if label and label != "Unknown"]
# Turn the comma-separated Genres string of each row into a Python list of labels.
anime["Genres"] = anime["Genres"].map(split_labels)
anime.head(3)
| MAL_ID | Name | Score | Genres | English name | Japanese name | Type | Episodes | Aired | Premiered | ... | Score-8 | Score-7 | Score-6 | Score-5 | Score-4 | Score-3 | Score-2 | Score-1 | Year | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Cowboy Bebop | 8.78 | [Action, Adventure, Comedy, Drama, Sci-Fi,... | Cowboy Bebop | カウボーイビバップ | TV | 26 | Apr 3, 1998 to Apr 24, 1999 | Spring 1998 | ... | 131625.0 | 62330.0 | 20688.0 | 8904.0 | 3184.0 | 1357.0 | 741.0 | 1580.0 | 1998 | Spring |
| 1 | 5 | Cowboy Bebop: Tengoku no Tobira | 8.39 | [Action, Drama, Mystery, Sci-Fi, Space] | Cowboy Bebop:The Movie | カウボーイビバップ 天国の扉 | Movie | 1 | Sep 1, 2001 | Unknown | ... | 49505.0 | 22632.0 | 5805.0 | 1877.0 | 577.0 | 221.0 | 109.0 | 379.0 | 2001 | Unknown |
| 2 | 6 | Trigun | 8.24 | [Action, Sci-Fi, Adventure, Comedy, Drama,... | Trigun | トライガン | TV | 26 | Apr 1, 1998 to Sep 30, 1998 | Spring 1998 | ... | 86142.0 | 49432.0 | 15376.0 | 5838.0 | 1965.0 | 664.0 | 316.0 | 533.0 | 1998 | Spring |
3 rows × 37 columns
# Number of titles per Year, as a two-column frame (Year, MAL_ID) where the
# MAL_ID column holds the count of titles for that year.
titles_per_year = anime.groupby('Year')['MAL_ID'].count()
anime_year = titles_per_year.to_frame().reset_index()
anime_year.head()
| Year | MAL_ID | |
|---|---|---|
| 0 | 0 | 309 |
| 1 | 1917 | 16 |
| 2 | 1918 | 7 |
| 3 | 1924 | 3 |
| 4 | 1925 | 6 |
import matplotlib.pyplot as plt
# Line plot of the number of titles released per year.
# Year 0 is the sentinel assigned to titles whose air date is 'Unknown'.
# Exclude it by value rather than by row position, so a real year is never
# dropped by accident if the sentinel row is absent or not first.
tmp = anime_year[anime_year['Year'] != 0].sort_values('Year')
plt.plot(tmp['Year'], tmp['MAL_ID'], color='cornflowerblue')
plt.xlim(1910, 2020)
plt.show()
# The 100 highest-scored titles. Titles whose score was 'Unknown' were given
# a score of 0 earlier, so they sort to the bottom and do not appear here.
anime_top100 = anime.sort_values('Score', ascending=False).head(100)
anime_top100.head(3)
| MAL_ID | Name | Score | Genres | English name | Japanese name | Type | Episodes | Aired | Premiered | ... | Score-8 | Score-7 | Score-6 | Score-5 | Score-4 | Score-3 | Score-2 | Score-1 | Year | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3971 | 5114 | Fullmetal Alchemist: Brotherhood | 9.19 | [Action, Military, Adventure, Comedy, Dram... | Fullmetal Alchemist:Brotherhood | 鋼の錬金術師 FULLMETAL ALCHEMIST | TV | 64 | Apr 5, 2009 to Jul 4, 2010 | Spring 2009 | ... | 199160.0 | 70045.0 | 20210.0 | 9308.0 | 3222.0 | 1536.0 | 2162.0 | 16806.0 | 2009 | Spring |
| 15926 | 40028 | Shingeki no Kyojin: The Final Season | 9.17 | [Action, Military, Mystery, Super Power, D... | Attack on Titan Final Season | 進撃の巨人 The Final Season | TV | 16 | Dec 7, 2020 to ? | Winter 2021 | ... | 26016.0 | 8793.0 | 2674.0 | 1336.0 | 588.0 | 382.0 | 514.0 | 11061.0 | 2020 | Winter |
| 5683 | 9253 | Steins;Gate | 9.11 | [Thriller, Sci-Fi] | Steins;Gate | STEINS;GATE | TV | 24 | Apr 6, 2011 to Sep 14, 2011 | Spring 2011 | ... | 140914.0 | 57740.0 | 21375.0 | 11126.0 | 5061.0 | 2292.0 | 1678.0 | 5255.0 | 2011 | Spring |
3 rows × 37 columns
# Count how many of the top-100 titles fall in each Year, then list the five
# years with the most entries.
top100_per_year = anime_top100.groupby('Year')['MAL_ID'].count()
anime_top100_year = top100_per_year.to_frame().reset_index()
anime_top100_year.sort_values('MAL_ID', ascending = False).head(5)
| Year | MAL_ID | |
|---|---|---|
| 23 | 2019 | 10 |
| 24 | 2020 | 9 |
| 21 | 2017 | 8 |
| 20 | 2016 | 8 |
| 22 | 2018 | 7 |
# Bar chart: number of top-100 titles per release year.
# MultipleLocator lives in matplotlib.ticker; it is only incidentally
# reachable through matplotlib.pyplot, so import it from its documented home.
from matplotlib.ticker import MultipleLocator

plt.figure(figsize=(10, 6))
plt.bar(anime_top100_year['Year'], anime_top100_year['MAL_ID'], color='cornflowerblue')
# Major ticks every 5 years on x and every single title on y.
x_major_locator = MultipleLocator(5)
y_major_locator = MultipleLocator(1)
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
ax.yaxis.set_major_locator(y_major_locator)
# Bars are centred on their year, so pad the limits by half a year; with a
# limit of exactly 2020 the right half of the 2020 bar would be cut off.
plt.xlim(1979.5, 2020.5)
plt.grid(color='grey', linestyle='dotted', linewidth=1)
plt.show()
from collections import defaultdict

# Tally how many titles carry each genre label. Labels are stripped of
# surrounding whitespace before counting so variants of the same label merge.
all_genres = defaultdict(int)
for label_list in anime['Genres']:
    for raw_label in label_list:
        cleaned_label = raw_label.strip()
        all_genres[cleaned_label] = all_genres[cleaned_label] + 1
from wordcloud import WordCloud

# Render the genre frequencies as a word cloud: the size of each word is
# driven by its count in all_genres.
cloud_builder = WordCloud(
    width=1200,
    height=800,
    background_color='white',
    colormap='Set2',
)
genres_cloud = cloud_builder.generate_from_frequencies(all_genres)
plt.imshow(genres_cloud, interpolation='bilinear')
plt.axis('off')
(-0.5, 1199.5, 799.5, -0.5)
# Subset of columns used by the exploratory charts that follow.
col = ['MAL_ID', 'Name', 'English name', 'Score', 'Genres', 'Type', 'Aired',
       'Premiered', 'Rating', 'Source', 'Episodes', 'Dropped']
eda = anime[col]

# Sort by Popularity (ascending) once, then group by Type so each group keeps
# that ordering.
# NOTE(review): presumably a lower Popularity value means a more popular
# title (rank-like) — confirm against the dataset description.
typ = anime.sort_values(by='Popularity').groupby('Type')[['Name', 'Popularity']]

# One horizontal bar chart per Type showing the five titles with the smallest
# Popularity value in that Type.
for i in eda['Type'].value_counts().index:
    # Build a new frame instead of mutating the groupby slice in place.
    bar = typ.get_group(i).set_index('Name')
    bar.head(5).plot(kind='barh', legend=False, color='cornflowerblue')
    # Pass the label as a plain string; wrapping it in a list renders the
    # list repr (e.g. "['TV']") on the axis.
    plt.xlabel(i)
    plt.grid()
    plt.show()
anime.sort_values('Score',ascending=False).head(5)
| MAL_ID | Name | Score | Genres | English name | Japanese name | Type | Episodes | Aired | Premiered | ... | Score-8 | Score-7 | Score-6 | Score-5 | Score-4 | Score-3 | Score-2 | Score-1 | Year | Season | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3971 | 5114 | Fullmetal Alchemist: Brotherhood | 9.19 | [Action, Military, Adventure, Comedy, Dram... | Fullmetal Alchemist:Brotherhood | 鋼の錬金術師 FULLMETAL ALCHEMIST | TV | 64 | Apr 5, 2009 to Jul 4, 2010 | Spring 2009 | ... | 199160.0 | 70045.0 | 20210.0 | 9308.0 | 3222.0 | 1536.0 | 2162.0 | 16806.0 | 2009 | Spring |
| 15926 | 40028 | Shingeki no Kyojin: The Final Season | 9.17 | [Action, Military, Mystery, Super Power, D... | Attack on Titan Final Season | 進撃の巨人 The Final Season | TV | 16 | Dec 7, 2020 to ? | Winter 2021 | ... | 26016.0 | 8793.0 | 2674.0 | 1336.0 | 588.0 | 382.0 | 514.0 | 11061.0 | 2020 | Winter |
| 5683 | 9253 | Steins;Gate | 9.11 | [Thriller, Sci-Fi] | Steins;Gate | STEINS;GATE | TV | 24 | Apr 6, 2011 to Sep 14, 2011 | Spring 2011 | ... | 140914.0 | 57740.0 | 21375.0 | 11126.0 | 5061.0 | 2292.0 | 1678.0 | 5255.0 | 2011 | Spring |
| 14963 | 38524 | Shingeki no Kyojin Season 3 Part 2 | 9.10 | [Action, Drama, Fantasy, Military, Mystery... | Attack on Titan Season 3 Part 2 | 進撃の巨人 Season3 Part.2 | TV | 10 | Apr 29, 2019 to Jul 1, 2019 | Spring 2019 | ... | 110481.0 | 33662.0 | 8365.0 | 2974.0 | 1108.0 | 550.0 | 385.0 | 4169.0 | 2019 | Spring |
| 9913 | 28977 | Gintama° | 9.10 | [Action, Comedy, Historical, Parody, Samur... | Gintama Season 4 | 銀魂° | TV | 51 | Apr 8, 2015 to Mar 30, 2016 | Spring 2015 | ... | 21360.0 | 10215.0 | 3898.0 | 2311.0 | 952.0 | 648.0 | 1100.0 | 4508.0 | 2015 | Spring |
5 rows × 37 columns
import plotly.graph_objects as go
# Pie chart of how many titles exist per Type.
type_counts = eda['Type'].value_counts()
labels = type_counts.index
values = type_counts.values
# NOTE(review): these hex codes have no leading '#', and there are five of
# them while Type has more categories than that — confirm the slices render
# with the intended colours.
colors = ["ff9f1c","ffbf69","ffffff","cbf3f0","2ec4b6"]

pie_trace = go.Pie(labels=labels, values=values)
fig = go.Figure(data=[pie_trace], layout=go.Layout(height=600, width=800))
fig.update_traces(
    hoverinfo='label+percent',
    textinfo='percent',
    textfont_size=20,
    marker=dict(colors=colors, line=dict(color='#000000', width=1)),
)
# Centre the title horizontally near the top of the figure.
fig.update_layout(
    title=dict(
        text="Medium of Streaming",
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top',
    )
)
fig.show()
import seaborn as sns
# Bar chart of the ten titles with the highest Dropped count.
name_and_dropped = eda[['Name', 'Dropped']]
drop = name_and_dropped.sort_values(by='Dropped', ascending=False).head(10)
plt.figure(figsize=(8, 4), dpi=100)
sns.barplot(data=drop, x='Name', y='Dropped', palette='Set2')
# Titles are long, so slant the tick labels to keep them readable.
plt.xticks(rotation=40, ha='right')
plt.show()
# Pie chart of how many titles fall under each Rating category.
labels = eda['Rating'].value_counts().index
values = eda['Rating'].value_counts().values
# NOTE: `colors` is not redefined in this cell; the list assigned for the
# "Medium of Streaming" chart is reused. An alternative palette is kept below
# for reference but is not active.
#colors = ["007f5f","2b9348","55a630","80b918","aacc00","bfd200","d4d700","dddf00","eeef20","ffff3f"]
fig = go.Figure(data=[go.Pie(labels=labels,
values=values)],layout=go.Layout(height=600, width=800))
# Show the percentage on each slice and outline slices with a thin black line.
fig.update_traces(hoverinfo='label+percent', textinfo='percent', textfont_size=20,
marker=dict(colors=colors, line=dict(color='#000000', width=1)))
# Centre the title horizontally near the top of the figure.
fig.update_layout(
title={
'text': "Rating based Anime",
'y':0.9,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'})
fig.show()
# Distribution of the user-given rating values: count the rows per rating
# value and order the bars by rating value.
rating_count = rating['rating'].value_counts().sort_index()
plt.figure(figsize=(8, 6))
rating_axes = sns.barplot(x=rating_count.index, y=rating_count.values, palette='Set2')
rating_axes.set_title('Comparison of the number of ratings from 0 to 10');
#rating.columns
# users who rated
len(rating.user_id.unique())
323575
# anime being rated
len(rating.anime_id.unique())
17383
# total anime
len(anime.MAL_ID.unique())
17562
(17562-17383)/17562
0.010192460995330829